library(conflicted)
conflict_prefer("Position", "base")
library(dplyr)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(readr)
library(tidyr)
library(reticulate)
library(clusterProfiler)
library(foreach)
library(doMC)
library(org.Hs.eg.db)
library(cowplot)
library(stringr)
knitr::knit_engines$set(python = reticulate::eng_python)
reticulate::py_available(TRUE)## [1] TRUE
# bug in rstudio/reticulate:
matplotlib <- import("matplotlib")
matplotlib$use("Agg", force = TRUE)
registerDoMC(cores=params$cpus)import scanpy as sc
import matplotlib.pyplot as plt
import matplotlib
import sys
sys.path.extend(("lib", "../lib"))
from jupytertools import *
# setwd()
fix_logging(sc.settings)
matplotlib.rcParams.update({"figure.autolayout": True, "figure.max_open_warning": 0})
adata = sc.read_h5ad(r.params["input_adata"])get_path = function(file_name) {file.path(params$input_de_res_dir, file_name)}
hpv_all = read_tsv(get_path("hpv.rda.res.tsv")) %>% mutate(comparison="all")
hpv_cd4 = read_tsv(get_path("hpv_t_cd4.rda.res.tsv")) %>% mutate(comparison="CD4+ non-reg")
hpv_cd8 = read_tsv(get_path("hpv_t_cd8.rda.res.tsv")) %>% mutate(comparison="CD8+")
hpv_treg = read_tsv(get_path("hpv_t_reg.rda.res.tsv")) %>% mutate(comparison="T reg")
hpv_nk = read_tsv(get_path("hpv_nk.rda.res.tsv")) %>% mutate(comparison="NK")
hpv = bind_rows(hpv_all, hpv_cd4, hpv_cd8, hpv_treg, hpv_nk)
bulk_hpv_all = read_tsv(get_path("bulk_hpv_overall.rda.res.tsv")) %>% mutate(comparison="all")
bulk_hpv_cd4 = read_tsv(get_path("bulk_hpv_t_cd4.rda.res.tsv")) %>% mutate(comparison="CD4+ non-reg")
bulk_hpv_cd8 = read_tsv(get_path("bulk_hpv_t_cd8.rda.res.tsv")) %>% mutate(comparison="CD8+")
bulk_hpv_treg = read_tsv(get_path("bulk_hpv_t_reg.rda.res.tsv")) %>% mutate(comparison="T reg")
bulk_hpv_nk = read_tsv(get_path("bulk_hpv_nk.rda.res.tsv")) %>% mutate(comparison="NK")
bulk_hpv = bind_rows(bulk_hpv_all, bulk_hpv_cd4, bulk_hpv_cd8, bulk_hpv_treg, bulk_hpv_nk)
ir_all = read_tsv(get_path("ir.rda.res.tsv")) %>% mutate(comparison="all")
ir_cd4 = read_tsv(get_path("ir_t_cd4.rda.res.tsv")) %>% mutate(comparison="CD4+ non-reg")
ir_cd8 = read_tsv(get_path("ir_t_cd8.rda.res.tsv")) %>% mutate(comparison="CD8+")
ir_treg = read_tsv(get_path("ir_t_reg.rda.res.tsv")) %>% mutate(comparison="T reg")
ir_nk = read_tsv(get_path("ir_nk.rda.res.tsv")) %>% mutate(comparison="NK")
ir = bind_rows(ir_all, ir_cd4, ir_cd8, ir_treg, ir_nk)
bulk_ir_all = read_tsv(get_path("bulk_ir_overall.rda.res.tsv")) %>% mutate(comparison="all")
bulk_ir_cd4 = read_tsv(get_path("bulk_ir_t_cd4.rda.res.tsv")) %>% mutate(comparison="CD4+ non-reg")
bulk_ir_cd8 = read_tsv(get_path("bulk_ir_t_cd8.rda.res.tsv")) %>% mutate(comparison="CD8+")
bulk_ir_treg = read_tsv(get_path("bulk_ir_t_reg.rda.res.tsv")) %>% mutate(comparison="T reg")
bulk_ir_nk = read_tsv(get_path("bulk_ir_nk.rda.res.tsv")) %>% mutate(comparison="NK")
bulk_ir = bind_rows(bulk_ir_all, bulk_ir_cd4, bulk_ir_cd8, bulk_ir_treg, bulk_ir_nk)
clusters_nk = read_tsv(get_path("cluster_nk.rda.res.tsv")) %>% mutate(cell_type="NK")
clusters_cd4 = read_tsv(get_path("cluster_t_cd4.rda.res.tsv")) %>% mutate(cell_type="CD4+")
clusters_cd8 = read_tsv(get_path("cluster_t_cd8.rda.res.tsv")) %>% mutate(cell_type="CD8+")
clusters = bind_rows(clusters_nk, clusters_cd4, clusters_cd8)
obs = read_tsv(params$input_obs, guess_max=1e6) %>%
separate(cluster, c("ct2", "_id"), remove=FALSE, sep="_")These are teh clusters as determined with the “Leiden” algorithm in the previous step.
We observe that some clusters, in particular CD8_2 and NK_1 are mostly driven by a single patient. While we cannot rule out entirely that this is driven by batch effects, it seems unlikely because: * These patients are outliers for the single cell-type only and integrate well with other patients for other cell types. * While the cluster seems unique for the patient, the respective patient-sample also contains T/NK cells that fall in another cluster.
## Joining, by = c("patient", "hpv_status", "ir_status", "cluster")
## Joining, by = "ct2"
## Joining, by = c("patient", "cluster", "ct2")
The following plots show the 20 most differentially expressed genes for each cluster. Note that the comparisons are within a certain cell-types only (i.e. the CD4+ sub-clusters are compared against other CD4+ subclusters, but not against CD8+ and NK cells).
Here, we compare the abundance of cluster by HPV status. Each point refers to the fraction of cells in a certain cluster that originates from a certain patient. The black point refers to the median, the vertical bar to the median average deviation (MAD).
It appears HPV16+ patients have a higher infiltration of cells of the type CD4_0. Note that this analysis is not ideal, as it depends on the number of cells detected per patient. This analysis assumes that the number of detected NK/T cells per patient is proportional to the true NK/T cell inflitration. This is reasonable, as we obtained the cells from a dataset containing also other cell types – we can assume that the NK/T cell infiltration is higher if they make up a larger fraction of the total cells.
On the other hand, the number of detected cells per sample also depends on the efficacy of FACS pre-sorting and the library preparation. However, these should be random processes and not related to the patient group.
We compare gene expression of HPV+ vs HPV-. We perform the comparision * for all cell-types together (=“overall”) * for each major cell type individually
Note that for CD8+ T cells, the major differences (TRBV23, TRBV27) are mainly driven by the cluster of CD8+ T cells that are only abundant in a single patient (CD8_2).
Red-ish genes are over-represented in HPV16+, blue ones under-represented.
None of the tested genes is statistically significantly different.
The relative abundance of single cell clusters by IR status. The black point refers to the median, the vertical bar to the median average deviation (MAD).
CD8_0 cells appear to be enriched in IR+ patients. Again, note that this analysis assumes that the number of detected NK/T cells correlates with the true NK/T cell infiltration.
## /data/scratch/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/R/library/reticulate/python/rpytools/call.py:13: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
## res = rpycall.call_r_function(f, *args, **kwargs)
Again, it seems that the major differences are mainly driven by a single patient (or two patients…). Here, we summarise the gene expression by patient and will more formally test it using a “bulk” differential gene expression analysis.
| patient | ir_status | hspa1a | hspa1b | cxcl13 |
|---|---|---|---|---|
| H176 | IR+ | 0.6138713 | 0.4415543 | 0.1229076 |
| H68 | IR+ | 0.3509002 | 0.2815898 | 0.1080206 |
| H197 | nan | 0.1328043 | 0.1037808 | 0.1736047 |
| H208 | IR- | 0.0847083 | 0.0395202 | 0.0709306 |
| H141 | IR+ | 0.0689916 | 0.0449480 | 0.0212808 |
| H160 | IR+ | 0.0680929 | 0.0563161 | 0.1222577 |
| H188 | IR+ | 0.0658586 | 0.0398949 | 0.0928682 |
| H143 | nan | 0.0557052 | 0.0591755 | 0.0540268 |
| H205 | nan | 0.0499548 | 0.0314365 | 0.0519849 |
| H185 | IR+ | 0.0488256 | 0.0460451 | 0.0944863 |
| H149 | IR- | 0.0464720 | 0.0479647 | 0.0280023 |
| H211 | IR- | 0.0324986 | 0.0180906 | 0.0273170 |
| H182 | IR- | 0.0303805 | 0.0113038 | 0.0695021 |
The enrichment of HSPA1A and CCL4 holds even in the bulk model. Yet, it is not statistically significant.
I retrieved a list of NK receptors from https://www.frontiersin.org/articles/10.3389/fimmu.2015.00601/full and mapped them to gene symbols manually using genecards.org.
nkrs = [
"TNFSF10", #TRAIL
"FCGR3A", #CD16
"NCR3", #NKp30a,b
"KLRC2", #NKG2C
"KLRK1", #NKG2D
"CD244", #2B4
"CD226", #DNAM-1
"KLRC1", #NKG2A
"KIR3DL1", #KIR
"TNFRSF9", #CD137/41BB
"TNFRSF4", #OX40
"CD27", #CD27
]adata_cd8 = adata[adata.obs["cell_type"] == "T cell CD8+", :]
sc.pp.highly_variable_genes(adata_cd8, flavor="cell_ranger", n_top_genes=2000)## extracting highly variable genes
## finished
## added
## 'highly_variable', boolean vector (adata.var)
## 'means', float vector (adata.var)
## 'dispersions', float vector (adata.var)
## 'dispersions_norm', float vector (adata.var)
## Trying to set attribute `.var` of view, making a copy.
## computing neighbors
## using 'X_pca' with n_pcs = 50
## finished
## computing UMAP
## finished
## /data/scratch/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/R/library/reticulate/python/rpytools/call.py:13: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
## res = rpycall.call_r_function(f, *args, **kwargs)